#!/usr/bin/perl
use POSIX;
use strict;
use Getopt::Long;
use Cwd 'abs_path';
use File::Copy;

###########
my $MIN_REGULATORY_SEQ_LEN=10;
my $MAX_REGULATORY_SEQ_LEN=100;
my $REGULATORY_SEQ_UPDOWN_RATIO=5;
my $RECOMMENDED_SEQLEN_PERCENTILE=0.975;

my $conservatoryDir=abs_path(".");
my $genomeDatabaseFileName = "genome_database.csv";
my $genome="";
my $family="";
my $force=0;
my $forceOrthology=0;
my $verbose=0;
my $help=0;
my $verbose=0;
my $genomeWasProcessed=0;
my $genomeDatabaseWasProcessed=0;
my $forceRun=0;
my $referenceGenomeName;
my $blastThreads=4;
my $blastProcesses=4;
my $totalProcesses = $blastThreads * $blastProcesses;
my $upstreamLength;
my $downstreamLength;
my $ignoreMissingFiles=0;
my $calculateRecommendedRegulatoryLen;


GetOptions ("conservatory-dir=s" => \$conservatoryDir,
			"genome-database=s" => \$genomeDatabaseFileName,
			"genome=s" => \$genome,
			"family=s" => \$family,
			"reference=s" => \$referenceGenomeName,
			"blast-threads=i" => \$blastThreads,
			"blast-processes=i" => \$blastProcesses,
			"force" => \$force,
			"force-orthology" => \$forceOrthology,
			"force-run" => \$forceRun,
			"ignore-missing-files" => \$ignoreMissingFiles,
			"verbose" => \$verbose,
			"help" => \$help) or die("Error in command line arguments\n");

if($referenceGenomeName eq "") {
	print "\n\nERROR: No reference genome provided (--reference).\n\n";
	$help =1;
}
if($genome ne "" && $family ne ""){
	print "\n\nERROR: Please select either a family or a genome to process.\n\n";
}

if($help) {
	print "Conservatory version 2.0.1\n\n";
	
	print "processGenomes\n\n";
	print "\t--conservatory-dir\tPath of the main conservatory directory. See README for directory structure. (DEFAULT: current directory)\n";
	print "\t--genome-database\t\tGenome database file name (DEFAULT: genome_database.csv).\n";
	print "\t--genome\t\tLimit genome processing to a specific genome (DEFAULT process all genomes). Incompatible with --family \n";
	print "\t--family\t\tLimit genome processing to a specific family (DEFAULT process all genomes). Incompatible with --genome \n";	
	print "\t--reference\t\tReference genome name.\n";
	print "\t--blast-threads\t\tNumber of threads to use (DEAFULT: 4). \n";
	print "\t--blast-processes\t\tNumber of blast processes to use (DEAFULT: 4). \n";
	print "\t--force\t\t\tForce rebuilding of all databases file (except orthology).\n";
	print "\t--force-orthology\tForce rebuilding orthology database.\n";
	print "\t--force-run\t\tDo not verify the status of the last run.\n\t\t\t\t[WARNING: This can cause data corruption if two processGenomes processes are starting in parallel.\n\t\t\t\tOnly use if you know what you are doing]\n";  
	print "\t--verbose\t\tOutput extra progress messages.\n";
	print "\t--help\t\t\tPrints this message.\n\n";
	exit(0);
}

#################### Set up files
my $genomedb_file = "$conservatoryDir/$genomeDatabaseFileName";
my $genomedb_tmp_file = $genomedb_file . ".tmp";

####### First, sanity checks. Check to see if blast is installed and if directory structure is OK.
####### 
die "ERROR: Cannot find file genome database file ($genomedb_file)\n" unless -e $genomedb_file;
die "ERROR: Conservatory directory structure at ($conservatoryDir) is corrupt\n" unless (-e "$conservatoryDir/genomes" && -e "$conservatoryDir/genomes/blastdb/" && -e "$conservatoryDir/scripts" && -e "$conservatoryDir/alignments");
my $blastp = `sh -c 'command -v blastp'`;
die "ERROR: Cannot find blast in path. Please make sure it is installed.\n" unless ($blastp ne "");  

my $bgzip = `sh -c 'command -v bgzip'`;
die "ERROR: Cannot find bgzip in path. Please make sure it is installed.\n" unless ($bgzip ne "");  

my $samtools = `sh -c 'command -v samtools'`;
die "ERROR: Cannot find samtools in path. Please make sure it is installed.\n" unless ($samtools ne "");  

my $agatCheck = `sh -c 'command -v agat_sp_extract_sequences.pl'`;
die "ERROR: Cannot find AGAT in path. Please make sure AGAT is installed.\n" unless ($agatCheck ne "");

my $seqkit = `sh -c 'command -v seqkit'`;
die "ERROR: Cannot find seqkit in path. Please make sure seqkit is installed.\n" unless ($seqkit ne "");

### Check all scripts are found
die "ERROR: Cannot find the buildOrthologDB executable in the scripts directory.\n" unless (-x "$conservatoryDir/scripts/buildOrthologDB");
die "ERROR: Cannot find the buildRegulatorySeqDB executable in the scripts directory.\n" unless (-x "$conservatoryDir/scripts/buildRegulatorySeqDB");
die "ERROR: Cannot find the buildFootprint executable in the scripts directory.\n" unless (-x "$conservatoryDir/scripts/buildFootprint");

### Check if another process is not running
if (-e $genomedb_tmp_file) {
	print localtime() . " : WARNING Another genome processing instance is running or last ran was aborted. Are you sure you want to run (y/[n])?";
	my $response = "y";
	if(!$forceRun) {
		$response = <STDIN>;
	}
	chomp $response;
	if($response eq "y") {
		unlink($genomedb_tmp_file);
	} else {
		die localtime () . " : ABORT\n";
	}
}
######## First, read genome database file and verify we have all the files.
########  locate the reference genome.
	
open(my $genomedb, "<", $genomedb_file);
my $header=<$genomedb>;
print localtime() . ": START Verifying the existance of all genome files.\n";
my $globalReferenceGenomeSpecies="";
my $globalReferenceGenomeFamily="";

my $foundError=0;
my $foundGenome=0;
while(my $curgenomeline= <$genomedb>) {
	chomp $curgenomeline;
	if(substr($curgenomeline,0,1) ne "#" ) { ### If not commented
		my ($curgenomeName, $curgenomeSpecies,  $curgenomeFamily, $curgenomeReference, $upstreamLength, $downstreamLength, $geneNameField, $geneProcessingRegEx, $gene2SpeciesIdentifier, $proteinProcessingRegEx) = split /,/, $curgenomeline;
		my $genomeFastaFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.fasta.gz";
		my $genomeFastaFileUncompressed = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.fasta";		
		my $genomeProteinFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.proteins.fasta";
		my $genomeGFFFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.genes.gff3";
		my $genomeTEFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.te.gff3";
		
		if(!( -e $genomeFastaFile)) {
			#### Maybe it is here but not compressed. check
			if( -e $genomeFastaFileUncompressed) {
				print localtime() . ": Genome file was found, but was not compressed. Zipping $genomeFastaFileUncompressed ...\n";
				system("bgzip -@ $totalProcesses -f $genomeFastaFileUncompressed");
			} else {
				$foundError=1;
				print localtime() . ": ERROR Cannot find genome fasta file ($genomeFastaFile)\n";
			}
		}
		if($genome ne "" && $genome eq $curgenomeName) {
			$foundGenome=1;
		}
		
		if(!(-e $genomeProteinFile)) {
			$foundError=1;
			print localtime() . ": ERROR Cannot find proteins fasta file ($genomeFastaFile)\n";
		}
		if(!(-e $genomeGFFFile)) {
			$foundError=1;
			print localtime() . ": ERROR Cannot find genome GFF3 file ($genomeGFFFile)\n";
		}

		if($curgenomeName eq $referenceGenomeName) {

			if($globalReferenceGenomeSpecies ne "") {
				print localtime() . ": ERROR Only one reference genome allowed.\n";
				$foundError=1;
			} else {
				$globalReferenceGenomeSpecies = $curgenomeSpecies;
				$globalReferenceGenomeFamily = $curgenomeFamily;
			}
		}
	}
}

if($globalReferenceGenomeFamily eq "") {
	$foundError=1;
	print localtime() . ": ERROR Cannot file reference genome $referenceGenomeName in genome database file.\n";
}
if($genome ne "" && !$foundGenome) {
	$foundError=1;
	print localtime() . ": ERROR Cannot file genome $genome in genome database file.\n";	
}
if($foundError && !$ignoreMissingFiles) {
	die localtime() . ": ERROR Cannot locate all genome files. ABORT execution.\n";
} else {
	print localtime() . ": END All genome files verified.\n";
}
close($genomedb);

##### Now, process the genomes
print localtime() . ": START Begin processing genomes.\n";

open($genomedb, "<", $genomedb_file);
open(my $genomedbProcess, ">", $genomedb_tmp_file);

$header=<$genomedb>;
print $genomedbProcess $header;

while(my $curgenomeline= <$genomedb>) {
	chomp $curgenomeline;
	if(substr($curgenomeline,0,1) ne "#" ) { ### If not commented
		my ($curgenomeName, $curgenomeSpecies,  $curgenomeFamily, $curgenomeReference, $upstreamLength, $downstreamLength, $geneNameField, $geneProcessingRegEx, $gene2SpeciesIdentifier, $proteinProcessingRegEx, $clade) = split /,/, $curgenomeline;
		my $curgenomeReferenceFamily = $curgenomeFamily;
	
		if(($genome ne "" && $genome eq $curgenomeName) || ($family ne "" && $family eq $curgenomeFamily) || ($genome eq "" && $family eq "")) {
			$genomeWasProcessed=1;

			print localtime() . ": START processing genome $curgenomeName\n";
			my $genomeFastaFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.fasta.gz";
			my $genomeFastaFileUncompressed = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.fasta";
			my $genomeProteinFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.proteins.fasta";
			my $genomeGFFFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.genes.gff3";
			my $genomeTEFile = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.te.gff3";
			
			$curgenomeReference = $referenceGenomeName;
			$curgenomeReferenceFamily = $globalReferenceGenomeFamily;
		
			### First, make sure we have the blast database for the reference genome
			if((! -e "$conservatoryDir/genomes/blastdb/$curgenomeReference.phr" || $force)) {
				my $genomeReferenceProteinFile = "$conservatoryDir/genomes/$curgenomeReferenceFamily/$curgenomeReference.proteins.fasta";
				print localtime() . ": $curgenomeName : START building protein blast database for reference genome\n";
				system("makeblastdb -in $genomeReferenceProteinFile -input_type fasta -dbtype prot -parse_seqids -out \"$conservatoryDir/genomes/blastdb/$curgenomeReference\" >/dev/null 2>&1 ");
				print localtime() . ": $curgenomeName : END building protein blast database for reference genome\n";		
			} 

			########### Now process processing genome.
			#######  Stage 1: Make blast database
		 
			if(($curgenomeName ne $curgenomeReference) && !( ( -e "$conservatoryDir/genomes/blastdb/$curgenomeName.phr") || $force)) {
				print localtime() . ": $curgenomeName : START building protein blast database\n";
				system("makeblastdb -in $genomeProteinFile -input_type fasta -dbtype prot -out \"$conservatoryDir/genomes/blastdb/$curgenomeName\" >>processGenome.log 2>&1 ");
				print localtime() . ": $curgenomeName : END building protein blast database\n";
			} else { print localtime() . ": SKIP Blast database already exists (use --force to rebuild)\n"; }
				
			########### Stage 2: Build GFF footprintfile
			(my $genomeFootprintGFFFile = $genomeGFFFile) =~ s/genes.gff3/footprint.gff3/;
			if(! -e $genomeFootprintGFFFile || $force) {
				print localtime() . ": $curgenomeName : START building footprint database\n";
				system("perl $conservatoryDir/scripts/buildFootprint --conservatoryDirectory $conservatoryDir --genome-database $genomeDatabaseFileName --genome $curgenomeName");
				die localtime() . ": ABORT Internal error. Can not run buildFootprint script!\n" unless -r $genomeFootprintGFFFile;
				print localtime() . ": $curgenomeName : END building footprint database\n";
			} else { print localtime() . ": SKIP Footprint database already exists (use --force to rebuild)\n"; }
		
			if( $upstreamLength == 0 || $downstreamLength ==0 | $force) {
				#### check the optimized regulatory sequence lengths
				#### scan the footprint file to find 97.5% intergenic region
				my $old=0, my @allSpaces;
				open(my $footprintFile, "<$genomeFootprintGFFFile");
				while(my $footprintLine = <$footprintFile>) {
					chomp $footprintLine;
					(my $chr, my $name, my $tag, my $start, my $end) = split /\t/, $footprintLine;
					if($start-$old >0) { push @allSpaces, $start-$old; }
					$old=$end;
				}
				my @sortedSpaces = sort { $a <=> $b } @allSpaces;
				# calculate recommended regulatory sequence lengths
				my $recommendedUpstream, my $recommendedDownstream;
				my $per95thIntergenic = ceil($sortedSpaces[ (scalar @sortedSpaces)*$RECOMMENDED_SEQLEN_PERCENTILE]/1000);
				if ($per95thIntergenic > $MAX_REGULATORY_SEQ_LEN) {
					$recommendedUpstream = $MAX_REGULATORY_SEQ_LEN;
					$recommendedDownstream = $MAX_REGULATORY_SEQ_LEN/$REGULATORY_SEQ_UPDOWN_RATIO;
				} elsif ($per95thIntergenic < $MIN_REGULATORY_SEQ_LEN) {
					$recommendedUpstream = $MIN_REGULATORY_SEQ_LEN;
					$recommendedDownstream = $MIN_REGULATORY_SEQ_LEN/$REGULATORY_SEQ_UPDOWN_RATIO;
				} else {
					$recommendedUpstream = $per95thIntergenic;
					$recommendedDownstream = ceil($per95thIntergenic/$REGULATORY_SEQ_UPDOWN_RATIO);
				}
				print localtime() . ": $curgenomeName : $RECOMMENDED_SEQLEN_PERCENTILE percentile intergenic region legnth is $per95thIntergenic kb\n";
				print localtime() . ": $curgenomeName : Recommended $recommendedUpstream kb upstream and $recommendedDownstream kb downstream\n";
				close($footprintFile);
				## Now update the genome Database file
				$upstreamLength = $recommendedUpstream;
				$downstreamLength = $recommendedDownstream;
				$genomeDatabaseWasProcessed=1;
			}

			###### Build chromosome size file
			my $chromosomeSizeFileName = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.chrom.size";
			if( (! -e $chromosomeSizeFileName || $force) && $curgenomeName eq $curgenomeReference) {
				print localtime() . ": $curgenomeName : START building chromosome size file.\n";
				system("samtools faidx $genomeFastaFile");
				open(my $chromosomeSizeFile, ">", $chromosomeSizeFileName );
				open(my $chromosomeIndexFile, "<", "$genomeFastaFile.fai");
				while(my $line = <$chromosomeIndexFile>) {
					my ($chr, $length) = split /\t/, $line;
					print $chromosomeSizeFile "$chr\t" . ($length+100) . "\n";
				}
				close($chromosomeSizeFile);
				close($chromosomeIndexFile);
				#system("cut -f1-2 $genomeFastaFile.fai >$chromosomeSizeFileName");
				print localtime() . ": $curgenomeName : END building chromosome size file.\n";
			}		

			######### Stage 3: Build regulatory sequence database
			my $upstreamDB = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.upstream.fasta";
			my $downstreamDB = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.downstream.fasta";
			my $cdsDB = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.cds.fasta";
			my $firstIntronDB = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeName.introns.fasta";
			## Introns would be here...
			my $teOption="";
			if(-e $genomeTEFile) { $teOption = $genomeTEFile; }
		
			if(! -e "$upstreamDB.gz" || $force) {
				if( -e "$upstreamDB.gz") { ## If it is found, delete old indices
					if(-e "$upstreamDB.gz") { unlink("$upstreamDB.gz"); }
					if(-e "$upstreamDB.gz.fai") { unlink("$upstreamDB.gz.fai"); }
					if(-e "$upstreamDB.gz.gzi") { unlink("$upstreamDB.gz.gzi"); }
				}
				print localtime() . ": $curgenomeName : START building upstream sequence database.\n";
				system("perl $conservatoryDir/scripts/buildRegulatorySeqDB upstream $genomeFastaFile $genomeFootprintGFFFile " . 1000*$upstreamLength . " $upstreamDB $teOption");
				
				die localtime() . ": ABORT Internal error. Can not run buildRegulatorySeqDB script!\n" unless -r $upstreamDB;
			
				# now zip and index
				system("bgzip -@ $totalProcesses -f $upstreamDB");
				system("samtools faidx $upstreamDB.gz");
			
				print localtime() . ": $curgenomeName : END building upstream sequence database.\n";
			}  else { print localtime() . ": SKIP Upstream sequence database already exists (use --force to rebuild)\n"; }

			if(! -e "$downstreamDB.gz" || $force) {
				if( -e $downstreamDB) { ## If it is found, delete old indices
					if(-e "$downstreamDB.gz") { unlink("$downstreamDB.gz"); }
					if(-e "$downstreamDB.gz.fai") { unlink("$downstreamDB.gz.fai"); }
					if(-e "$downstreamDB.gz.gzi") { unlink("$downstreamDB.gz.gzi"); }
				}
				print localtime() . ": $curgenomeName : START building downstream sequence database.\n";
				system("perl $conservatoryDir/scripts/buildRegulatorySeqDB downstream $genomeFastaFile $genomeFootprintGFFFile " . 1000*$downstreamLength . " $downstreamDB $teOption");
				die localtime() . ": ABORT Internal error. Can not run buildRegulatorySeqDB script!\n" unless -r $downstreamDB;
				# now zip and index
				system("bgzip -@ $totalProcesses -f $downstreamDB");
				system("samtools faidx $downstreamDB.gz");
				print localtime() . ": $curgenomeName : END building downstream sequence database.\n";			
			}  elsif ($verbose) { print localtime() . ": SKIP Downstream sequence database already exists (use --force to rebuild)\n"; }
			
			#### Make sure we have CDS file if this is the reference
			if(($curgenomeFamily eq $globalReferenceGenomeFamily) && (!(-e $cdsDB) || $force)) {
				### openup the genome
				print localtime() . ": $curgenomeName : START building CDS sequence database.\n";
				
				system("gunzip -k $genomeFastaFile");
				#### AGAT doesn't like single line fasta files
				system("mv $genomeFastaFileUncompressed $genomeFastaFileUncompressed.tmp");
				system("seqkit seq -w 80 $genomeFastaFileUncompressed.tmp > $genomeFastaFileUncompressed");
				unlink("$genomeFastaFileUncompressed.tmp");
				unlink("$genomeFastaFileUncompressed.index");
				
				system("agat_sp_extract_sequences.pl --gff $genomeGFFFile --fasta $genomeFastaFileUncompressed -t cds -o $cdsDB");
				unlink($genomeFastaFileUncompressed);

				print localtime() . ": $curgenomeName : END building CDS sequence database.\n";
			}
		
			######## Stage 4: Perform reciprocal blast
			#make room for blast temporary files
			mkdir "$conservatoryDir/genomes/tmpblast/";
			my $orthologFilename = "$conservatoryDir/genomes/$curgenomeFamily/$curgenomeReference.$curgenomeName.orthologs.csv";
		
			my $blastG2REF = "$conservatoryDir/genomes/tmpblast/$curgenomeName" . "2" ."$curgenomeReference.txt";
			my $blastREF2G = "$conservatoryDir/genomes/tmpblast/$curgenomeReference" . "2" ."$curgenomeName.txt";

			if(! (-e $orthologFilename) || $forceOrthology) {
				if(! ((-e $blastG2REF) || (-e "$blastG2REF.gz")))  {
					print localtime() . ": $curgenomeName : START reciprocal blast (genome -> reference).\n";
					system("perl scripts/fblastp --genomeA $curgenomeReference --genomeB $curgenomeName --out $blastG2REF --blast-threads $blastThreads --blast-processes $blastProcesses");
					system("pigz --force $blastG2REF");
					print localtime() . ": $curgenomeName : END reciprocal blast (genome -> reference).\n";
				}
				if(! ((-e $blastREF2G) || (-e "$blastREF2G.gz"))) {
					print localtime() . ": $curgenomeName : START reciprocal blast (reference -> genome).\n";
					system("perl scripts/fblastp --genomeB $curgenomeReference --genomeA $curgenomeName --out $blastREF2G --blast-threads $blastThreads --blast-processes $blastProcesses");
					system("pigz --force $blastREF2G");	
					print localtime() . ": $curgenomeName : END reciprocal blast (reference -> genome).\n";
				}

				print localtime() . ": $curgenomeName : START Building ortholog database.\n";
				
				system("perl $conservatoryDir/scripts/buildOrthologDB --conservatoryDirectory $conservatoryDir --genome-database $genomeDatabaseFileName --genome $curgenomeName --reference $curgenomeReference");

				print localtime() . ": $curgenomeName : END Building ortholog database.\n";
			} else { print localtime(). ": SKIP Ortholog database already exists (use --force-orthology to rebuild)\n"; }
			print localtime() . ": END processing genome $curgenomeName\n-------------------------------------------------------------------------------------------------------------\n";
		}
		print $genomedbProcess "$curgenomeName,$curgenomeSpecies,$curgenomeFamily,,$upstreamLength,$downstreamLength,$geneNameField,$geneProcessingRegEx,$gene2SpeciesIdentifier,$proteinProcessingRegEx,$clade\n";
	} else {  print $genomedbProcess $curgenomeline . "\n"; } ### Spit out comments
}
close($genomedb);
close($genomedbProcess);

if(! $genomeWasProcessed) {
	if($genome ne "") { die ("ERROR: No genome $genome was found in $genomedb_file.\n"); }
	else { die ("ERROR: Genome database file is empty!\n"); }
}
if($genomeDatabaseWasProcessed) {
	print localtime() . ": Updating genome database file.\n";
	copy($genomedb_tmp_file, $genomedb_file);
}
unlink($genomedb_tmp_file);

print localtime() . ": END genome processing run.\n";
